#!/usr/bin/env python
# -*- coding:utf-8 -*-

""" 
:Description: Create a DataFrame from a file
:Owner: leo_jie
:Create time: 2019/12/6
"""

from pyspark.sql import SparkSession
from pyspark.sql import Row

# Build (or reuse, via getOrCreate) a session that runs against the "local" master.
spark = (
    SparkSession.builder
    .master("local")
    .appName("leo_pyspark_learning")
    .getOrCreate()
)

# Reflection-based approach: build an RDD from a file, then convert it to a DataFrame

# Location of the sample CSV, given as a file:// URI; a raw string is used so
# the Windows backslashes are not treated as escape sequences.
file_path = r"file:///D:\leo_python_project\leo_pyspark_learning\pyspark_sql_example\doc\test.csv"

"""
# Contents of test.csv
1,leo,23
2,leo2,22
"""


def f(x):
    """Map one split record to a dict of named fields.

    :param x: indexable record; positions 0, 1 and 2 are read as id, name and age.
    :return: dict with keys ``id`` and ``name`` (taken as-is) and ``age``
        (converted with ``int``), in that insertion order.
    """
    record_id, name, raw_age = x[0], x[1], x[2]
    return dict(id=record_id, name=name, age=int(raw_age))


# Read the file as lines of text, split each line on commas, turn the fields
# into a Row through f(), and convert the resulting RDD of Rows to a DataFrame.
# NOTE(review): the sample output further down lists columns alphabetically
# (age, id, name); that presumably reflects PySpark < 3.0, where Row(**kwargs)
# sorted field names -- newer versions keep the id, name, age order. Verify
# against the PySpark version in use.
df = (
    spark.sparkContext.textFile(file_path)
    .map(lambda line: Row(**f(line.split(','))))
    .toDF()
)

"""
df.printSchema()
root
 |-- age: long (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
"""

"""
df.show()
+---+---+----+
|age| id|name|
+---+---+----+
| 23|  1| leo|
| 22|  2|leo2|
+---+---+----+
"""

# Shut the session down now that the example has finished.
spark.stop()
